# -*- coding: utf-8 -*-
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule

from gushici.items import GushiciItem
import re

class ShiciSpider(CrawlSpider):
    """Crawl www.shicimingju.com and emit one ``GushiciItem`` per matching page.

    The crawl starts at the site's home page and follows every link whose
    URL contains ``/<digits>.html``. Each such page is handed to
    :meth:`parse_item`, and its own links are followed as well.
    """

    name = 'shici'
    allowed_domains = ['www.shicimingju.com']
    start_urls = ['http://www.shicimingju.com/']

    rules = (
        Rule(LinkExtractor(allow=r'/\d+\.html', unique=True), callback='parse_item', follow=True),
    )

    @staticmethod
    def _first_text(selector, query):
        """Return the stripped first result of ``query`` on ``selector``.

        An empty string is returned when the query matches nothing, so
        callers never have to deal with ``None``.
        """
        return selector.xpath(query).extract_first(default='').strip()

    def parse_item(self, response):
        """Extract title, author, content, tags and dynasty from a page.

        Pages without the ``shici-container www-shadow-card`` div yield
        nothing. Any individual field that is missing from the container
        is emitted as an empty string instead of raising.

        Args:
            response: The response of a page matched by ``rules``.

        Yields:
            GushiciItem: One item holding the extracted fields.
        """
        shici_container = response.xpath('//div[@class="shici-container www-shadow-card"]')
        if not shici_container:
            return

        title = self._first_text(shici_container, './h1[@class="shici-title"]/text()')
        author = self._first_text(shici_container, './div[@class="shici-info"]/a/text()')
        # string(...) flattens the content div, nested markup included, into one text value.
        content = self._first_text(shici_container, 'string(./div[@class="shici-content"])')
        tags = ','.join(shici_container.xpath('./div[@class="shici-mark"]/a/text()').extract())
        # The first text node of the info div carries the dynasty wrapped in
        # square brackets; drop the brackets and any whitespace left behind.
        raw_dynasty = self._first_text(shici_container, './div[@class="shici-info"]/text()')
        dynasty = re.sub(r'[\[\]]', '', raw_dynasty).strip()

        yield GushiciItem(title=title, author=author, content=content, dynasty=dynasty, tags=tags)
